import json

# Load the original training set from disk. The rest of the script
# iterates over it and reads a "question" and an "answer" from each entry.
with open('./packJson/train_plant.json', encoding='utf-8') as src:
    original_data = json.loads(src.read())

# Collapse duplicate questions so that each question maps to one answer.
# When a question appears more than once, the longest answer wins; on a
# tie in length the answer seen first is kept. This is a simple heuristic
# and can be swapped for a better selection rule if needed.
unique_questions = {}
for record in original_data:
    q_text = record["question"]
    a_text = record["answer"]
    # Skip this record when an equally long or longer answer is already stored.
    if q_text in unique_questions and len(a_text) <= len(unique_questions[q_text]):
        continue
    unique_questions[q_text] = a_text

# Build the output records, appending the end marker to every answer.
end_token = "<END>"
new_data = [
    {"question": q_text, "answer": a_text + end_token}
    for q_text, a_text in unique_questions.items()
]

# Save the processed training set to a new file. ensure_ascii=False keeps
# non-ASCII characters as-is instead of \uXXXX escapes; indent=4 pretty-prints.
with open('packJson/train_plant2.json', mode='w', encoding='utf-8') as sink:
    json.dump(new_data, sink, indent=4, ensure_ascii=False)